The aim is to combine label transfer and CNV inference to annotate Wilms tumor samples in SCPCP000006. The proposed annotation will be based on the combination of:
the label transfer from the fetal kidney reference (Stewart et
al.), in particular the fetal_kidney_predicted.compartment
and fetal_kidney:predicted.cell_type, as well as the
prediction.score for each compartment,
the predicted CNV calculated using intra-sample endothelial and
immune cells (--reference both) as normal
reference
In a second time, we will explore and validate the chosen annotation.
We will use some of the markers genes to validate visually the annotations.
The analysis can be summarized as the following:
Where cnv.thr and pred.thr need to be
discussed
| first level annotation | second level annotation | selection of the cells | marker genes for validation | cnv validation |
|---|---|---|---|---|
| normal | endothelial | compartment == “endothelium” & predicted.score > pred.thr & cnv_score < cnv.thr | WVF | no cnv |
| normal | immune | compartment == “immune” & predicted.score > pred.thr & cnv_score < cnv.thr | PTPRC, CD163, CD68 | no cnv |
| normal | kidney | cell_type %in% c(“kidney cell”, “kidney epithelial”, “podocyte”) & predicted.score > pred.thr & cnv_score < cnv.thr | CDH1, PODXL, LTL | no cnv |
| normal | stroma | compartment == “stroma” & predicted.score > pred.thr & cnv_score < cnv.thr | VIM | no cnv |
| cancer | stroma | compartment == “stroma” & cnv_score > cnv.thr | VIM | proportion_cnv_chr -1 -4 -11 -16 -17 -18 |
| cancer | blastema | compartment == “fetal_nephron” & cell_type == “mesenchymal cell” & cnv_score > cnv.thr | CITED1 | proportion_cnv_chr -1 -4 -11 -16 -17 -18 |
| cancer | epithelial | compartment == “fetal_nephron” & cell_type != “mesenchymal cell” & cnv_score > cnv.thr | CDH1 | proportion_cnv_chr -1 -4 -11 -16 -17 -18 |
| unknown | - | the rest of the cells | - | proportion_cnv_chr -1 -4 -11 -16 -17 -18 |
# The base path for the OpenScPCA repository, found by its (hidden) .git directory
repository_base <- rprojroot::find_root(rprojroot::is_git_root)
# The current data directory, found within the repository base directory
data_dir <- file.path(repository_base, "data", "current", "SCPCP000006")
# The path to this module
module_base <- file.path(repository_base, "analyses", "cell-type-wilms-tumor-06")
result_dir <- file.path(module_base, "results")In this notebook, we are working with all of the samples in SCPCP000006.
The sample metadata can be found in sample_metadata_file
in the data folder.
We extracted from the pre-processed and labeled Seurat
object from
results/{sample_id}/06_infercnv_HMM-i3_SCPCS000169_reference-both.rds.
the following information per cell:
the predicted compartment in
fetal_kidney_predicted.compartment and
fetal_kidney_predicted.compartment.score
the predicted compartment in
fetal_kidney_predicted.cell_type and
fetal_kidney_predicted.cell_type.score
the sample identifier in sample_id
the proportion of estimated CNV per chromosome, {i} in 1 to 22
proportion_cnv_chr{i}
the countsof markers
genes
sample_metadata_file <- file.path(repository_base, "data", "current", "SCPCP000006", "single_cell_metadata.tsv")
metadata <- read.table(sample_metadata_file, sep = "\t", header = TRUE)
sample_ids <- metadata |>
dplyr::filter(seq_unit != "spot") |>
dplyr::pull(scpca_sample_id) |>
unique()
# Create a data frames of all annotations
cell_type_df <- sample_ids |>
purrr::map(
# For each sample_id, do the following:
\(sample_id) {
# This sample was run with "none" for reference
if (sample_id == "SCPCS000190") {
reference <- "none"
} else {
reference <- "both"
}
input_file <- file.path(
result_dir,
sample_id,
glue::glue("06_infercnv_HMM-i3_{sample_id}_reference-{reference}.rds")
)
# Read in the Seurat object
srat <- readRDS(input_file)
# Create and return a data frame from the Seurat object with relevant annotations
# this data frame will have four columns: barcode, sample_id, compartment, organ
data.frame(
# label transfer from the fetal kidney reference
cell_type = srat$fetal_kidney_predicted.cell_type,
compartment = srat$fetal_kidney_predicted.compartment,
# predicted.scores from the label transfer from the fetal kidney reference
cell_type.score = srat$fetal_kidney_predicted.cell_type.score,
compartment.score = srat$fetal_kidney_predicted.compartment.score,
# cell embedding
umap = srat@reductions$umap@cell.embeddings,
# marker genes
PTPRC = FetchData(object = srat, vars = "ENSG00000081237", layer = "counts"),
VWF = FetchData(object = srat, vars = "ENSG00000110799", layer = "counts"),
VIM = FetchData(object = srat, vars = "ENSG00000026025", layer = "counts"),
CITED1 = FetchData(object = srat, vars = "ENSG00000125931", layer = "counts"),
CDH1 = FetchData(object = srat, vars = "ENSG00000039068", layer = "counts"),
PODXL = FetchData(object = srat, vars = "ENSG00000128567", layer = "counts"),
COL6A3 = FetchData(object = srat, vars = "ENSG00000163359", layer = "counts"),
SIX2 = FetchData(object = srat, vars = "ENSG00000170577", layer = "counts"),
NCAM1 = FetchData(object = srat, vars = "ENSG00000149294", layer = "counts"),
THY1 = FetchData(object = srat, vars = "ENSG00000154096", layer = "counts"),
# proportion of cnv per chromosome
proportion_cnv_chr1 = srat$proportion_cnv_chr1,
proportion_cnv_chr2 = srat$proportion_cnv_chr2,
proportion_cnv_chr3 = srat$proportion_cnv_chr3,
proportion_cnv_chr4 = srat$proportion_cnv_chr4,
proportion_cnv_chr5 = srat$proportion_cnv_chr5,
proportion_cnv_chr6 = srat$proportion_cnv_chr6,
proportion_cnv_chr7 = srat$proportion_cnv_chr7,
proportion_cnv_chr8 = srat$proportion_cnv_chr8,
proportion_cnv_chr9 = srat$proportion_cnv_chr9,
proportion_cnv_chr10 = srat$proportion_cnv_chr10,
proportion_cnv_chr11 = srat$proportion_cnv_chr11,
proportion_cnv_chr12 = srat$proportion_cnv_chr12,
proportion_cnv_chr13 = srat$proportion_cnv_chr13,
proportion_cnv_chr14 = srat$proportion_cnv_chr14,
proportion_cnv_chr15 = srat$proportion_cnv_chr15,
proportion_cnv_chr16 = srat$proportion_cnv_chr16,
proportion_cnv_chr17 = srat$proportion_cnv_chr17,
proportion_cnv_chr18 = srat$proportion_cnv_chr18,
proportion_cnv_chr19 = srat$proportion_cnv_chr19,
proportion_cnv_chr20 = srat$proportion_cnv_chr20,
proportion_cnv_chr21 = srat$proportion_cnv_chr21,
proportion_cnv_chr22 = srat$proportion_cnv_chr22,
# cnv global estimation per chromosome
has_cnv_chr1 = srat$has_cnv_chr1,
has_cnv_chr2 = srat$has_cnv_chr2,
has_cnv_chr3 = srat$has_cnv_chr3,
has_cnv_chr4 = srat$has_cnv_chr4,
has_cnv_chr5 = srat$has_cnv_chr5,
has_cnv_chr6 = srat$has_cnv_chr6,
has_cnv_chr7 = srat$has_cnv_chr7,
has_cnv_chr8 = srat$has_cnv_chr8,
has_cnv_chr9 = srat$has_cnv_chr9,
has_cnv_chr10 = srat$has_cnv_chr10,
has_cnv_chr11 = srat$has_cnv_chr11,
has_cnv_chr12 = srat$has_cnv_chr12,
has_cnv_chr13 = srat$has_cnv_chr13,
has_cnv_chr14 = srat$has_cnv_chr14,
has_cnv_chr15 = srat$has_cnv_chr15,
has_cnv_chr16 = srat$has_cnv_chr16,
has_cnv_chr17 = srat$has_cnv_chr17,
has_cnv_chr18 = srat$has_cnv_chr18,
has_cnv_chr19 = srat$has_cnv_chr19,
has_cnv_chr20 = srat$has_cnv_chr20,
has_cnv_chr21 = srat$has_cnv_chr21,
has_cnv_chr22 = srat$has_cnv_chr22
) |>
tibble::rownames_to_column("barcode") |>
dplyr::mutate(sample_id = sample_id)
}
) |>
# now combine all dataframes to make one big one
dplyr::bind_rows()The report will be saved in the notebook directory.
do_Feature_mean shows heatmap of mean expression of a
feature grouped by a metadata.
df is the name of the table containing metadata and
feature expression (counts) per cellsgroup.by is the name of the metadata to group the
cellsfeature is the name of the gene to average the
expressiondo_Feature_mean <- function(df, group.by, feature) {
df <- df %>%
group_by(sample_id, !!sym(group.by)) %>%
summarise(m = mean(!!sym(feature)))
p <- ggplot(df, aes(x = sample_id, y = !!sym(group.by), fill = m)) +
geom_tile() +
scale_fill_viridis_c() +
theme_bw() +
theme(text = element_text(size = 20)) +
theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5)) +
guides(fill = guide_colourbar(title = paste0(feature)))
return(p)
}As done in 06_cnv_infercnv_exploration.Rmd, we calculate
single CNV score and assess its potential in identifying cells with CNV
versus normal cells without CNV.
We simply checked for each chromosome if the cell
has_cnv_chr. Would the cell have more than
cnv_threshold chromosome with CNV, the global
has_cnv_score will be TRUE. Else, the cell will have a
has_cnv_score set to FALSE.
cell_type_df <- cell_type_df |>
mutate(has_cnv_score = rowSums(cell_type_df[, grepl("has_cnv_chr", colnames(cell_type_df))])) |>
mutate(has_cnv_score = case_when(
has_cnv_score > params$cnv_threshold ~ "CNV",
has_cnv_score <= params$cnv_threshold ~ "no CNV"
))
table(cell_type_df$has_cnv_score)CNV no CNV 190793 9545
At first, we like to indicate in the
first.level_annotation if a cell is normal, cancer or
unknown.
endothelium, immune, stroma or
fetal nephron) and do not have cnv. We only allow a bit of
flexibility in terms of cnv profile for immune and endothelium cells
that have a high predicted score. Indeed, we know that false positive
cnv can be observed in a cell type specific manner.The threshold used for the predicted.score is defined as
a parameter of this notebook as 0.85. The threshold used for the
identification of cnv is also defined in the params of the notebook as
0.
stroma or
fetal nephron compartments and must have at least few
cnv.# Define normal cells
# We first pick up the immune and endothelial cells annotated via the label transfer compartments under the condition that the predicted score is above the threshold
cell_type_df <- cell_type_df |>
mutate(first.level_annotation = case_when(
# assign normal/cancer based on condition
compartment %in% c("fetal_nephron", "stroma") & has_cnv_score == "no CNV" ~ "normal",
compartment %in% c("endothelium", "immune") & cell_type.score > params$predicted.celltype.threshold ~ "normal",
compartment %in% c("fetal_nephron", "stroma") & has_cnv_score == "CNV" ~ "cancer",
.default = "unknown"
))Using this basic strategy, we identified 190303 cancer cells, 8088 normal cells. Only 1947cells remain unknown
ggplot(cell_type_df, aes(x = umap.umap_1, y = umap.umap_2, color = first.level_annotation), shape = 19, size = 1) +
geom_point() +
facet_wrap(facets = ~sample_id, ncol = 5) +
theme_bw() +
theme(text = element_text(size = 22))Normal cells from the fetal nephron compartment must
be normal kidney cells.
Normal cells from the stroma compartment must be
normal stroma cells.
Immune and endothelial cells have been already identified by label transfer.
Wilms tumor cancer cells can be:
cancer stroma: We define as cancer stroma all cancer cells from the stroma compartment.
blastema,: we defined as bastema every cancer
cell that has a
fetal_kidney_predicted.cell_type == mesenchymal cell. We
know that these mesenchymal cells are cells from the cap
mesenchyme that are not expected to be in a mature kidney. These
blastema cells should express higher CITED1.
cancer epithelium: we defined as cancer epithelium all cancer cells that are neither stroma nor blastemal cells. We expect these cells to express epithelial markers. Their predicted cell type should correspond to more mature kidney epithelial subunits.
cell_type_df <- cell_type_df |>
mutate(second.level_annotation = case_when(
# assign normal cells based on condition
cell_type_df$compartment %in% c("fetal_nephron") &
cell_type_df$has_cnv_score == "no CNV" ~ "kidney",
cell_type_df$compartment %in% c("stroma") &
cell_type_df$has_cnv_score == "no CNV" ~ "normal stroma",
cell_type_df$compartment %in% c("endothelium") &
(cell_type_df$compartment.score > params$predicted.celltype.threshold |
cell_type_df$has_cnv_score == "no CNV") ~ "endothelium",
cell_type_df$compartment %in% c("immune") &
(cell_type_df$compartment.score > params$predicted.celltype.threshold |
cell_type_df$has_cnv_score == "no CNV") ~ "immune",
# assign cancer cells based on condition
cell_type_df$compartment %in% c("stroma") &
cell_type_df$has_cnv_score == "CNV" ~ "cancer stroma",
cell_type_df$compartment %in% c("fetal_nephron") &
cell_type_df$has_cnv_score == "CNV" &
cell_type_df$cell_type == "mesenchymal cell" ~ "blastema",
cell_type_df$compartment %in% c("fetal_nephron") &
cell_type_df$has_cnv_score == "CNV" &
cell_type_df$cell_type != "mesenchymal cell" ~ "cancer epithelium",
.default = "unknown"
))ggplot(cell_type_df[cell_type_df$first.level_annotation == "normal", ], aes(x = umap.umap_1, y = umap.umap_2, color = second.level_annotation), shape = 19, size = 1) +
geom_point() +
facet_wrap(facets = ~sample_id, ncol = 5) +
theme_bw() +
theme(text = element_text(size = 22))ggplot(cell_type_df[cell_type_df$first.level_annotation == "cancer", ], aes(x = umap.umap_1, y = umap.umap_2, color = second.level_annotation), shape = 19, size = 0.1) +
geom_point() +
facet_wrap(facets = ~sample_id, ncol = 5) +
theme_bw() +
theme(text = element_text(size = 22))for (i in 1:22) {
print(do_Feature_mean(cell_type_df, group.by = "second.level_annotation", feature = glue::glue("proportion_cnv_chr", i)))
}This section creates the cell type annotation table for export.
annotations_table <- cell_type_df |>
select(
cell_barcode = barcode,
scpca_sample_id = sample_id,
tumor_cell_classification = first.level_annotation,
cell_type_assignment = second.level_annotation
) |>
mutate(
# change cancer --> tumor, but keep the other labels
tumor_cell_classification = ifelse(
tumor_cell_classification == "cancer", "tumor", tumor_cell_classification
),
cell_type_assignment = str_replace_all(
cell_type_assignment,
"cancer ",
"tumor "
)
)
write_tsv(annotations_table, annotations_tsv)Confirm how many samples we have annotations for:
## [1] 40
Combining label transfer and CNV inference we have produced draft annotations for all 40 Wilms tumor samples in SCPCP000006
The heatmaps of cnv proportion and marker genes support our annotations, but signals with some marker genes are very low. Also, there is no universal marker for each entity of Wilms tumor that cover all tumor cells from all patient. This makes the validation of the annotations quite difficult.
However, we could try to take the problem from the other side, and used the current annotation to perform differential expression analysis and try to find marker genes that are consistent across patient and Wilms tumor histologies.
In each histology (i.e. epithelial and stroma), the distinction between cancer and non cancer cell is difficult (as expected). In this analysis, we suggested to rely on the cnv score to assess the normality of the cell. Here again, we could try to run differential expression analysis and compare epithelial (resp. stroma) cancer versus non-cancer cells across patient, aiming to find a share transcriptional program allowing the classification cancer versus normal.
In our annotation, we haven’t taken into account the favorable/anaplastic status of the sample. However, as anaplasia can occur in every (but do not has to) wilms tumor histology, I am not sure how to integrate the information into the annotation.
This notebook could be finally rendered using different parameters, i.e. threshold for the cnv score and predicted score to use.
# record the versions of the packages used in this analysis and other environment information
sessionInfo()## R version 4.4.1 (2024-06-14)
## Platform: aarch64-apple-darwin20
## Running under: macOS 15.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] DT_0.33 patchwork_1.2.0 lubridate_1.9.3 forcats_1.0.0
## [5] stringr_1.5.1 dplyr_1.1.4 purrr_1.0.2 readr_2.1.5
## [9] tidyr_1.3.1 tibble_3.2.1 ggplot2_3.5.1 tidyverse_2.0.0
## [13] Seurat_5.1.0 SeuratObject_5.0.2 sp_2.1-4
##
## loaded via a namespace (and not attached):
## [1] RColorBrewer_1.1-3 rstudioapi_0.16.0 jsonlite_1.8.8
## [4] magrittr_2.0.3 spatstat.utils_3.1-0 farver_2.1.2
## [7] rmarkdown_2.28 vctrs_0.6.5 ROCR_1.0-11
## [10] spatstat.explore_3.3-2 htmltools_0.5.8.1 sass_0.4.9
## [13] sctransform_0.4.1 parallelly_1.38.0 KernSmooth_2.23-24
## [16] bslib_0.8.0 htmlwidgets_1.6.4 ica_1.0-3
## [19] plyr_1.8.9 plotly_4.10.4 zoo_1.8-12
## [22] cachem_1.1.0 igraph_2.0.3 mime_0.12
## [25] lifecycle_1.0.4 pkgconfig_2.0.3 Matrix_1.7-0
## [28] R6_2.5.1 fastmap_1.2.0 fitdistrplus_1.2-1
## [31] future_1.34.0 shiny_1.9.1 digest_0.6.37
## [34] colorspace_2.1-1 rprojroot_2.0.4 tensor_1.5
## [37] RSpectra_0.16-2 irlba_2.3.5.1 labeling_0.4.3
## [40] progressr_0.14.0 fansi_1.0.6 spatstat.sparse_3.1-0
## [43] timechange_0.3.0 httr_1.4.7 polyclip_1.10-7
## [46] abind_1.4-5 compiler_4.4.1 bit64_4.0.5
## [49] withr_3.0.1 fastDummies_1.7.4 highr_0.11
## [52] MASS_7.3-61 tools_4.4.1 lmtest_0.9-40
## [55] httpuv_1.6.15 future.apply_1.11.2 goftest_1.2-3
## [58] glue_1.7.0 nlme_3.1-166 promises_1.3.0
## [61] grid_4.4.1 Rtsne_0.17 cluster_2.1.6
## [64] reshape2_1.4.4 generics_0.1.3 gtable_0.3.5
## [67] spatstat.data_3.1-2 tzdb_0.4.0 data.table_1.16.0
## [70] hms_1.1.3 utf8_1.2.4 spatstat.geom_3.3-2
## [73] RcppAnnoy_0.0.22 ggrepel_0.9.5 RANN_2.6.2
## [76] pillar_1.9.0 vroom_1.6.5 spam_2.10-0
## [79] RcppHNSW_0.6.0 later_1.3.2 splines_4.4.1
## [82] lattice_0.22-6 bit_4.0.5 survival_3.7-0
## [85] deldir_2.0-4 tidyselect_1.2.1 miniUI_0.1.1.1
## [88] pbapply_1.7-2 knitr_1.48 gridExtra_2.3
## [91] scattermore_1.2 xfun_0.47 matrixStats_1.3.0
## [94] stringi_1.8.4 lazyeval_0.2.2 yaml_2.3.10
## [97] evaluate_0.24.0 codetools_0.2-20 cli_3.6.3
## [100] uwot_0.2.2 xtable_1.8-4 reticulate_1.38.0
## [103] munsell_0.5.1 jquerylib_0.1.4 Rcpp_1.0.13
## [106] globals_0.16.3 spatstat.random_3.3-1 png_0.1-8
## [109] spatstat.univar_3.0-0 parallel_4.4.1 dotCall64_1.1-1
## [112] listenv_0.9.1 viridisLite_0.4.2 scales_1.3.0
## [115] ggridges_0.5.6 crayon_1.5.3 leiden_0.4.3.1
## [118] rlang_1.1.4 cowplot_1.1.3